1 Getting ready

1.1 Load packages

library(here) # A Simpler Way to Find Your Files, CRAN v1.0.1
library(tidyverse) # Easily Install and Load the 'Tidyverse', CRAN v1.3.0 
library(qiime2R) # Import qiime2 artifacts to R, [github::jbisanz/qiime2R] v0.99.35  
library(phyloseq) # Handling and analysis of high-throughput microbiome census data, Bioconductor v1.34.0 
library(microbiome) # Microbiome Analytics, Bioconductor v1.12.0
library(picante) # Integrating Phylogenies and Ecology, CRAN v1.8.2 
library(MicrobeR) # Handy functions for microbiome analysis in R, [github::jbisanz/MicrobeR] v0.3.2 
library(mixOmics) # Omics Data Integration, Bioconductor v6.14.0 
library(RUVSeq) # Remove Unwanted Variation from RNA-Seq Data, Bioconductor v1.24.0
library(sva) # Surrogate Variable Analysis, Bioconductor v3.38.0 
library(cowplot) # Streamlined Plot Theme and Plot Annotations for 'ggplot2', CRAN v1.1.1  
library(ggpubr) # 'ggplot2' Based Publication Ready Plots, CRAN v0.4.0
library(ggh4x) # Hacks for 'ggplot2', CRAN v0.1.2.1 
library(plotly) # Create Interactive Web Graphics via 'plotly.js', CRAN v4.9.3
library(RColorBrewer) # ColorBrewer Palettes, CRAN v1.1-2 
library(vegan) # Community Ecology Package, CRAN v2.5-7 

1.2 Import data

# metadata
mtd <- read_tsv(here("data/metadata.tsv"), comment = "#q2") %>%
  rename(SampleID = "#SampleID") %>% 
  column_to_rownames("SampleID") %>% 
  mutate(PCRBatch   = factor(PCRBatch),
         Diet = factor(Diet, levels = c("REF", "IM")),
         Compartment = factor(Compartment, levels = c("PI", "DI")),
         SampleOrigin = factor(
           SampleOrigin, 
           levels = c("Feed", "Water", "Digesta", "Mucosa",
                      "Mock", "DNA-extraction", "Amplicon-PCR")),
         SampleType = factor(
           SampleType, 
           levels = c("REF-PID", "REF-PIM", "REF-DID", "REF-DIM", 
                      "IM-PID", "IM-PIM", "IM-DID", "IM-DIM", 
                      "Feed", "Water", "Extraction-blank", "PCR-blank", "Mock"))) 

# feature table
otu <- read_qza(here("data/intermediate/qiime2/97otu/table-filtered-97otu-sepp-inserted.qza")) %>%
  pluck("data") %>% 
  as("matrix")

# taxonomy
txnm <- read_qza(here("data/intermediate/qiime2/asv/taxonomy-silva132.qza"))
txnm <- txnm$data %>% 
  as.data.frame() %>%
  mutate(Taxon = gsub("D_0", "k", Taxon), Taxon = gsub("D_1", "p", Taxon),
         Taxon = gsub("D_2", "c", Taxon), Taxon = gsub("D_3", "o", Taxon),
         Taxon = gsub("D_4", "f", Taxon), Taxon = gsub("D_5", "g", Taxon),
         Taxon = gsub("D_6", "s", Taxon)) %>%
  separate(Taxon, sep = ";", c("Kingdom", "Phylum", "Class", "Order", "Family", "Genus", "Species")) %>% 
  column_to_rownames("Feature.ID") %>%
  select(-Confidence)

# phylogeny
tree <- read_qza(here("data/intermediate/qiime2/97otu/insertion-tree-97otu.qza"))

# assemble a phyloseq object
ps <- phyloseq(sample_data(mtd), 
               otu_table(otu, taxa_are_rows = TRUE),
               tax_table(as.matrix(txnm)),
               phy_tree(tree$data))

1.3 Data preprocessing

# change OTU names
indx <- formatC(1:ntaxa(ps), width = nchar(ntaxa(ps)), format = "d", flag = "0")
taxa_names(ps) <- paste0("OTU", indx)

# filter data
ps <- subset_samples(ps, !SampleType %in% c("Extraction-blank", "PCR-blank", "Mock")) # remove control samples

# extract metadata, otu table, taxonomy and phylogeny from phyloseq
mtd <- data.frame(sample_data(ps), check.names = FALSE)
otu <- as.data.frame(otu_table(ps)) %>% t()
txnm <- tax_table(ps) %>% as("matrix") %>% as.data.frame()
tree <- phy_tree(ps)

2 Assess batch effect

# Centered log-ratio transformation
otu_clr <- logratio.transfo(otu, logratio = "CLR", offset = 1 ) %>% as("matrix")

# PCA
pca_before <- pca(otu_clr, ncomp = 3)

# PCA plot
pcaPlot_before <- cbind(mtd, pca_before$variates$X) %>%
  ggplot(aes(x = PC1, y = PC2, color = SampleType, shape = PCRBatch)) +
  geom_point(size = 2) +
  geom_line(aes(group = SampleName), color = "black") + # connect paired technical replicates by line
  labs(title = "Before batch correction", color = "Sample type", shape = "PCR batch", 
       x = paste0("PC1: ", round(100 * pca_before$explained_variance[1], 2), "%"),
       y = paste0("PC2: ", round(100 * pca_before$explained_variance[2], 2), "%")) +
  scale_color_manual(values = brewer.pal(n = 12, name = "Paired")[c(1:10, 12)]) +
  theme_bw() 

pcaPlot_before 

From the PCA plot, we can tell that technical replicate samples are far apart. Mucosa samples from fish fed the REF diet form 2 clusters based on PCR batches. There’s a strong PCR batch effect.

3 Correct batch effect

3.1 RUVSeq

RUVSeq removes unwanted variation in RNASeq data using replicate samples and negative control genes. Here we use it for correcting batch effects in microbiome data. The two main parameters of RUVSeq are the number of factors of unwanted variation, k, and the set of negative control variables genes, or OTUs in our case. The choice of the parameter k is not easy and is data-dependent. Empirically, a few (2-3) factors are enough to capture the unwanted variation. In noisy data, K can be increased to 5 or 10. Note that if k is too high, the RUVSeq over-corrects for unwanted variation and ends up removing (almost) all the biological variability within the conditions. The choice of negative control variables is also somewhat data-dependent. However, RUVSeq is robust to the choice of negative control variables. One is recommend to perform extensive exploratory data analysis, comparing different values of k and sets of negative control variables.

# calculate coefficient of variation (cv) for each otu
cv <- t(otu) %>%
  as.data.frame() %>%
  rownames_to_column("featureID") %>%
  mutate(sd   = apply(.[, names(.) != "featureID"], 1, sd),
         mean = apply(.[, names(.) != "featureID"], 1, mean),
         cv   = sd / mean) %>%
  arrange(cv)

# proportions of otus used as negative controls
prp <- c(0.1, 0.2, 0.5, 1) 

# number of unwanted variation factors 
k <- c(1, 2, 3, 4, 5, 10) 

# get combinations of k and prp for parameter tuning
prmt <- expand.grid(prp = prp, k = k)

# sample replicate 
SampleType <- mtd$SampleType
names(SampleType) <- rownames(mtd)
rpl <- makeGroups(SampleType)

# batch effect correction
ruv <- map2(
  prmt$prp,
  prmt$k,
  function(x, y) 
  {
    # get otus showing least variance at defined proportions
    min_cv <- cv[1:round(nrow(cv) * x), ] %>%
      mutate(featureID = paste0("^", featureID, "$")) # exact math of otu ids
   
    # subset otus used as negative controls
    nc <- grepl(paste(min_cv$featureID, collapse = "|"), colnames(otu))
    
    # run RUVs
    ruv <- RUVs(t(otu), nc, y, rpl)
  }
)

# assign names to list elements
names(ruv) <- paste0("ruv_k", prmt$k, "_nc", prmt$prp)

3.2 ComBat-Seq

ComBat-seq is a batch effect correction method for RNASeq data. It is an improved model based on the popular effect correction tool, ComBat.ComBat-seq takes raw count matrix as input. Same as ComBat, it requires a known batch variable.

# PCR batch
PCRBatch <- mtd$PCRBatch
names(PCRBatch) <- rownames(mtd)

# model matrix
mod_mtrx <- model.matrix( ~ SampleType)

# run CombatSeq
cmbt <- ComBat_seq(t(otu), batch = PCRBatch, covar_mod = mod_mtrx) 
## Found 3 batches
## Using null model in ComBat-seq.
## Adjusting for 9 covariate(s) or covariate level(s)
## Estimating dispersions
## Fitting the GLM model
## Shrinkage off - using GLM estimates for parameters
## Adjusting the data

4 Evaluate methods

4.1 Visual inspection: PCA plot

4.1.1 RUVSeq

Extract and log ratio transform batch effect corrected OTU table.

ruv_clr <- purrr::map(ruv, ~pluck(.x, "normalizedCounts")) %>% # get batch corrected count table
  purrr::map(~t(.x)) %>% # transpose normalized count table
  purrr::map(~logratio.transfo(.x, logratio = 'CLR', offset = 1)) %>% # CLR transformation
  purrr::map(~as(.x, "matrix")) 

Run PCA.

pca_ruv <- purrr::map(ruv_clr, ~pca(.x, ncomp = 3)) 

Make PCA plots.

The PCA plots with k = 4, 5 or 10 look promising. Let’s look at one of them in 3d dimensions.

# extract data
k4_nc1 <- cbind(mtd, pca_ruv$ruv_k4_nc1$variates$X)

# 3d PCA plot with plot_ly
plot_ly(
  x = k4_nc1[, "PC1"], y = k4_nc1[, "PC2"], z = k4_nc1[, "PC3"], 
  type = "scatter3d", mode = "markers", color = k4_nc1$SampleType, 
  colors = "Paired") %>% #brewer.pal(n = 12, name = "Paired")
  layout(scene = list(
    xaxis = list(title = paste0('PC1: ', round(pca_ruv$ruv_k4_nc1$explained_variance[1]*100, 2), "%")),
    yaxis = list(title = paste0('PC2: ', round(pca_ruv$ruv_k4_nc1$explained_variance[2]*100, 2), "%")),
    zaxis = list(title = paste0('PC3: ', round(pca_ruv$ruv_k4_nc1$explained_variance[3]*100, 2), "%"))
    )
  )

4.1.2 ComBat-Seq

# centered log-ratio transformation
cmbt_clr <- logratio.transfo(t(cmbt), logratio = 'CLR', offset = 1) %>% 
  as("matrix")

# PCA
pca_cmbt <- pca(cmbt_clr, ncomp = 3)

# make PCA plot
pcaPlot_cmbt <- cbind(mtd, pca_cmbt$variates$X) %>%
  ggplot(aes(x = PC1, y = PC2, color = SampleType, shape = PCRBatch)) +
  geom_point(size = 2) +
  geom_line(aes(group = SampleName), color = "black") +
  labs(title = "ComBat-Seq", 
       color = "Sample type", shape = "PCR batch", 
       x = paste0("PC1: ", round(100 * pca_cmbt$explained_variance[1], 2), "%"),
       y = paste0("PC2: ", round(100 * pca_cmbt$explained_variance[2], 2), "%")) +
  scale_color_manual(values = brewer.pal(n = 12, name = "Paired")[c(1:10, 12)]) +
  theme_bw() 

pcaPlot_cmbt

4.2 Variance explained

4.2.1 Run pRDA

The partial redundancy analysis (pRDA) is a multivariate method to assess globally the effect of treatments and batch. Here, we run pRDA and get variance explained by treatments and batch.

# combine otu table before and after batch corrections in a list
otus <- c(before = list(otu_clr), ruv_clr, combatseq = list(cmbt_clr)) 

# merge otu tables and nest by batch correction methods 
otus_nst <- purrr::map(otus, as.data.frame) %>%
  bind_rows(.id = "batch_correction") %>%
  group_nest(batch_correction, .key = "data")

# define experiment design
design <- data.frame(SampleType = rep(SampleType, length(otus)), 
                     PCRBatch = rep(PCRBatch, length(otus)), 
                     batch_correction = rep(names(otus), each = nrow(mtd)))

design_nst <- group_nest(design, batch_correction, .key = "design")

# run pRDA and extract variance explained by treatment and batch effects
rda <- inner_join(otus_nst, design_nst, by = "batch_correction") %>%
  mutate(rda_trt = map2(data, design, ~rda(.x ~ PCRBatch + Condition(SampleType), data = .y)), 
         rda_bat = map2(data, design, ~rda(.x ~ SampleType + Condition(PCRBatch), data = .y)),
         var_trt = map_dbl(rda_trt, ~.x$pCCA$tot.chi*100/.x$tot.chi),
         var_bat = map_dbl(rda_bat, ~.x$pCCA$tot.chi*100/.x$tot.chi),
         batch_correction = factor(batch_correction, levels = names(otus)))  %>%
  select(batch_correction, var_trt, var_bat)  %>%
  rename(Method = batch_correction, Treatment = var_trt, Batch = var_bat)  %>%
  pivot_longer(Treatment:Batch, names_to = "Type", values_to = "var_expl")  %>%
  mutate(var_expl = round(var_expl, 2))

4.2.2 Plotting

Make bar plots showing variance explained by treatments and batch.

var_expl <- ggplot(rda, aes(x = fct_rev(Method), y = var_expl, fill = Type)) + 
  geom_bar(stat = "identity", position = 'dodge', colour = 'black') + 
  geom_text(aes(Method, var_expl + 5, label = var_expl), size = 3,
            position = position_dodge(width = 0.9)) + 
  coord_flip() +
  scale_y_continuous(limits = c(0, 100), expand = expansion(mult = c(0, 0))) +
  labs(x = "", y = "Variance explained (%)") + 
  theme_bw(base_size = 12) +
  guides(fill = guide_legend(reverse = TRUE)) 

var_expl

4.3 Summary

Based on the PCA plots and partial redundancy analysis (pRDA), we can conclude that the RUVSeq provides better batch effect corrections than the Combat-Seq does. In agreement with previous data, the RUVSeq is robust to the choice of negative control variables. Increasing the number of factors of unwanted variation, k, removes more unwanted variation. Setting k = 4, 5 or 10 yields very similar results. To avoid over-correcting batch effects, we choose the smallest K value, ie., k = 4 and nc = 1.

5 Validate results

Get batch effect corrected OTU table and replace the original OTU table in the phyloseq object.

# extract batch effect corrected OTU table
otu_adj <- ruv$ruv_k4_nc1$normalizedCounts

# replace otu table in the phyloseq object
otu_table(ps) <- otu_table(otu_adj, taxa_are_rows = TRUE)

5.1 Taxonomy

Make a table containing top 10 most abundant taxa at genus level.

taxa_tab <- Summarize.Taxa(otu_adj, txnm)$Genus %>% Make.Percent() 
taxa_tab <- taxa_tab[order(rowMeans(taxa_tab), decreasing = T), ]
Others <- colSums(taxa_tab[11:nrow(taxa_tab), ])
taxa_tab <- rbind(taxa_tab[1:10, ], Others)

Tidy dataframe for making stacked bar plots.

taxa_tab <- as.data.frame(taxa_tab) %>%
  rownames_to_column("Taxa") %>%
  separate(Taxa, sep = ";", c("Kingdom", "Phylum", "Class", "Order", "Family", "Genus")) %>% 
  mutate(Phylum = ifelse(is.na(Phylum)|Phylum == "NA"|grepl("uncultured|Ambiguous|metagenome", Phylum), 
                         Kingdom, 
                         Phylum),
         Class = ifelse(is.na(Class)|Class == "NA"|grepl("uncultured|Ambiguous|metagenome", Class), 
                        Phylum, 
                        Class),
         Order = ifelse(is.na(Order)|Order == "NA"|grepl("uncultured|Ambiguous|metagenome", Order), 
                        Class, 
                        Order),
         Family = ifelse(is.na(Family)|Family == "NA"|grepl("uncultured|Ambiguous|metagenome", Family), 
                         Order, 
                         Family),
         Genus = ifelse(is.na(Genus)|Genus == "NA"|grepl("uncultured|Ambiguous|metagenome", Genus), 
                        Family, 
                        Genus)) %>%
  select(-Kingdom, -(Class:Family)) %>%
  pivot_longer(-c(Phylum, Genus), names_to = "SampleID", values_to = "Abundance") %>%
  mutate(Phylum = gsub("p__", "", Phylum),
         Genus = gsub("g__", "", Genus),
         Genus = factor(Genus, levels = rev(unique(Genus)))) %>%
  inner_join(rownames_to_column(mtd, "SampleID"), by = "SampleID")

Make stacked bar plots.

# define color scheme
col <- c("grey", brewer.pal(n = 10, name = "Paired"))
    
# water samples
taxa_bar_water <- filter(taxa_tab, SampleType == "Water") %>%
  ggplot(aes(x = SampleID, y = Abundance, fill = Genus)) +
  geom_bar(stat = "identity") +
  labs(x = "", y = "") +
  scale_y_continuous(breaks = 0:10*10, expand = c(0,0)) + 
  scale_fill_manual(values = col) +
  guides(fill=guide_legend(ncol=1)) +
  facet_wrap(~ SampleOrigin) +
  theme_bw() +
  theme(axis.text.x = element_blank(),
        axis.text.y = element_blank(),
        strip.background = element_blank(),
        legend.margin = margin(0, 0, 0, 0),
        legend.box.margin = margin(5, -10, 5, -8)) 

# feed samples
taxa_bar_feed <- filter(taxa_tab, SampleType == "Feed") %>%
  ggplot(aes(x = Diet, y = Abundance, fill = Genus)) +
  geom_bar(stat = "identity", width = 0.5) +
  labs(x = "Sample", y = "") +
  scale_y_continuous(breaks = 0:10*10, expand = c(0,0)) + 
  scale_fill_manual(values = col) +
  facet_nested(~ SampleOrigin + Diet, scales = "free_x", nest_line = TRUE) +
  theme_bw() +
  theme(axis.text.x = element_blank(),
        axis.text.y = element_blank(),
        strip.background = element_blank(),
        legend.position = "none") 

# samples from fish fed the REF diet 
taxa_bar_ref <- filter(taxa_tab, !(IsTechnicalReplicate == "yes" & PCRBatch == 2)) %>%
  filter(grepl("REF-", SampleType)) %>%
  ggplot(aes(x = SampleID, y = Abundance, fill = Genus)) +
  geom_bar(stat = "identity") +
  labs(x = "", y = "Relative abundance (%)") +
  scale_y_continuous(breaks = 0:10*10, expand = c(0,0)) + 
  scale_fill_manual(values = col) +
  facet_nested(~ Diet + SampleOrigin + Compartment, scales = "free_x", nest_line = TRUE) +
  theme_bw() +
  theme(axis.text.x = element_blank(),
        strip.background = element_blank(),
        legend.position = "none") 

# samples from fish fed the IM diet 
taxa_bar_im <- filter(taxa_tab, !(IsTechnicalReplicate == "yes" & PCRBatch == 2)) %>%
  filter(grepl("IM-", SampleType)) %>%
  ggplot(aes(x = SampleID, y = Abundance, fill = Genus)) +
  geom_bar(stat = "identity") +
  labs(x = "", y = "") +
  scale_y_continuous(breaks = 0:10*10, expand = c(0,0)) + 
  scale_fill_manual(values = col) +
  facet_nested(~ Diet + SampleOrigin + Compartment, scales = "free_x", nest_line = TRUE) +
  theme_bw() +
  theme(axis.text.x = element_blank(),
        axis.text.y = element_blank(),
        strip.background = element_blank(),
        legend.position = "none") 

# assemble plots
plot_grid(
  taxa_bar_ref, 
  taxa_bar_feed + theme(plot.margin = margin(l = -0.5, unit = "cm")),
  taxa_bar_im + theme(plot.margin = margin(l = -0.4, unit = "cm")),
  taxa_bar_water + theme(plot.margin = margin(l = -0.3, unit = "cm")), 
  nrow =  1, align = 'h', axis = "tb", 
  rel_widths = c(6, 1.5, 6, 4.5)) 

Compared to the taxa barplot (data/intermediate/qiime2/97otu/taxa-bar-plots-filtered-97otu.qzv) before batch correction , the taxonomic composition of samples is very different after the batch correction by RUVSeq.

5.2 Alpha diversity

Compute alpha diversity indices.

# compute Observed features, Peilou's Evenness and Shannon's index
alph_npd <- alpha(ps, index = c("observed", "evenness_pielou", "diversity_shannon")) %>%
  rownames_to_column("SampleID")

# compute Faith's Phylogenetic Diversity
alph_pd <- pd(samp = t(otu_table(ps)), tree = phy_tree(ps), include.root = F) %>%
  select(PD) %>%
  rownames_to_column("SampleID")

# combine data
alph <- inner_join(alph_npd, alph_pd, by = "SampleID") %>%
  inner_join(rownames_to_column(mtd, "SampleID"), by = "SampleID") %>%
  rename("Observed OTUs" = observed, "Pielou's evenness" = evenness_pielou, 
         "Shannon's index" = diversity_shannon, "Faith's PD" = PD) %>%
  pivot_longer("Observed OTUs":"Faith's PD", names_to = "alph_indx", values_to = "value") %>%
  mutate(alph_indx = factor(alph_indx, levels = c("Observed OTUs", "Pielou's evenness", 
                                                  "Shannon's index", "Faith's PD")))

Alpha diversity of technical replicates.

filter(alph, IsTechnicalReplicate == "yes") %>%
  ggplot(aes(x = PCRBatch, y = value)) +
  geom_boxplot(aes(color = PCRBatch), width = 0.3) +
  geom_point(aes(color = PCRBatch)) +
  geom_line(aes(group = SampleName), linetype = "dashed") + 
  labs(x = "PCR batch", color = "PCR batch") +
  facet_wrap(~alph_indx, scales = "free_y") +
  scale_fill_brewer(palette = "Dark2") +
  theme_bw() 

Alpha diversity of REF-PIM and REF-DIM samples amplified in different PCR batches.

filter(alph, SampleType %in% c("REF-PIM", "REF-DIM")) %>%
  ggplot(aes(x = SampleType, y = value)) +
  geom_boxplot(width = 0.3) +
  geom_point(aes(color = PCRBatch)) +
  labs(x = "Sample type", color = "PCR batch") +
  facet_wrap(~ alph_indx, scales = "free_y") +
  scale_fill_brewer(palette = "Dark2") +
  theme_bw() 

In line with previous observations, unweighted alpha indices such as Observed OTUs and Faith’s PD are more easily influenced by technical variations than weighted alpha indices like Shannon’s index.

5.3 Beta diversity

OTU table summary.

sampleSum <- colSums(otu_adj) %>% sort()
sampleSum
## AqFl1-134 AqFl1-031 AqFl1-073 AqFl1-087 AqFl1-033 AqFl1-058 AqFl1-083 AqFl1-089 
##       703      1689      1696      1874      1906      1924      1960      1969 
## AqFl1-069 AqFl1-133 AqFl1-062 AqFl1-059 AqFl1-128 AqFl1-024 AqFl1-091 AqFl1-065 
##      1974      1975      2061      2072      2077      2132      2162      2224 
## AqFl1-078 AqFl1-053 AqFl1-020 AqFl1-017 AqFl1-124 AqFl1-126 AqFl1-002 AqFl1-094 
##      2263      2281      2300      2308      2330      2400      2460      2502 
## AqFl1-052 AqFl1-012 AqFl1-061 AqFl1-001 AqFl1-044 AqFl1-004 AqFl1-049 AqFl1-077 
##      2509      2525      2554      2606      2615      2618      2639      2684 
## AqFl1-009 AqFl1-129 AqFl1-021 AqFl1-025 AqFl1-035 AqFl1-014 AqFl1-047 AqFl1-041 
##      2710      2734      2759      2778      2783      2807      2886      2895 
## AqFl1-038 AqFl1-055 AqFl1-029 AqFl1-074 AqFl1-007 AqFl1-063 AqFl1-068 AqFl1-045 
##      2936      2958      3081      3123      3182      3387      3387      3549 
## AqFl1-027 AqFl1-088 AqFl1-072 AqFl1-096 AqFl1-042 AqFl1-015 AqFl1-032 AqFl1-123 
##      3741      3900      3906      3915      3982      4054      4113      4411 
## AqFl1-057 AqFl1-125 AqFl1-095 AqFl1-080 AqFl1-127 AqFl1-084 AqFl1-081 AqFl1-013 
##      4636      4652      4800      4897      4941      4971      5084      5206 
## AqFl1-066 AqFl1-056 AqFl1-076 AqFl1-054 AqFl1-040 AqFl1-043 AqFl1-022 AqFl1-130 
##      5256      5304      5387      5648      5651      5794      5980      6015 
## AqFl1-030 AqFl1-023 AqFl1-008 AqFl1-003 AqFl1-046 AqFl1-086 AqFl1-060 AqFl1-010 
##      6062      6175      6343      6356      6800      6828      6852      7435 
## AqFl1-064 AqFl1-079 AqFl1-028 AqFl1-011 AqFl1-034 AqFl1-070 AqFl1-039 AqFl1-016 
##      7552      7751      7909      8242      8313      8424      8630      8739 
## AqFl1-019 AqFl1-050 AqFl1-092 AqFl1-026 AqFl1-051 AqFl1-082 AqFl1-006 AqFl1-093 
##      9067      9146      9188      9715     10040     10188     10891     11007 
## AqFl1-071 AqFl1-037 AqFl1-075 AqFl1-018 AqFl1-036 AqFl1-090 AqFl1-067 AqFl1-115 
##     11121     11739     11916     12071     12602     12605     13519     14355 
## AqFl1-005 AqFl1-118 AqFl1-048 AqFl1-119 AqFl1-116 AqFl1-120 AqFl1-121 AqFl1-117 
##     14919     16104     17409     31981     35889     37743     38256     42168 
## AqFl1-122 
##     48477

We lost quite a lot of sequences after the batch effect correction. Here we rarefy OTU table at a sub-sampling depth of 1689 sequences.

ps_rf <- rarefy_even_depth(ps, sample.size = sampleSum[2], rngseed = 1910)

5.3.1 Jaccard distance

# pcoa analysis
ord_jac <- ordinate(ps_rf, method = "PCoA", distance = "jaccard")

# ordination plot
plot_ordination(ps_rf, ord_jac, color = "SampleType", shape = "PCRBatch") +
  scale_color_manual(values = brewer.pal(n = 12, name = "Paired")[c(1:10, 12)]) +
  geom_line(aes(group = SampleName), color = "black") +
  geom_point(size = 3) +
  labs(title = "PCoA of Jaccard distance after batch correcton by RUVSeq") +
  theme_bw() 

5.3.2 Bray-Curtis distance

# pcoa analysis
ord_bc <- ordinate(ps_rf, method = "PCoA", distance = "bray")

# ordination plot
plot_ordination(ps_rf, ord_bc, color = "SampleType", shape = "PCRBatch") +
  scale_color_manual(values = brewer.pal(n = 12, name = "Paired")[c(1:10, 12)]) +
  geom_line(aes(group = SampleName), color = "black") +
  geom_point(size = 3) +
  labs(title = "PCoA of Bray-Curtis distance after batch correcton by RUVSeq") +
  theme_bw() 

5.3.3 UniFrac distance

Unweighted UniFrac distance.

# pcoa analysis
ord_uwuf <- ordinate(ps_rf, method = "PCoA", distance = "unifrac", weighted = FALSE)

# ordination plot
plot_ordination(ps_rf, ord_uwuf, color = "SampleType", shape = "PCRBatch") +
  scale_color_manual(values = brewer.pal(n = 12, name = "Paired")[c(1:10, 12)]) +
  geom_line(aes(group = SampleName), color = "black") +
  geom_point(size = 3) +
  labs(title = "PCoA of unweighted UniFrac distance after batch correcton by RUVSeq") +
  theme_bw() 

Weighted UniFrac distance.

# pcoa analysis
ord_wuf <- ordinate(ps_rf, method = "PCoA", distance = "unifrac", weighted = TRUE)

# ordination plot
plot_ordination(ps_rf, ord_wuf, color = "SampleType", shape = "PCRBatch") +
  scale_color_manual(values = brewer.pal(n = 12, name = "Paired")[c(1:10, 12)]) +
  geom_line(aes(group = SampleName), color = "black") +
  geom_point(size = 3) +
  labs(title = "PCoA of weighted UniFrac distance after batch correcton by RUVSeq") +
  theme_bw() 

PcoA plots based on various distance metrics show that batch effect is partly removed after the adjustment by RUVSeq.

6 Conclusion

RUVSeq largely corrected PCR batch effects. However, the batch correction removed a large proportions of sequences in many samples and resulted in very different taxonomic compositions. Hence, we’ll not correct the PCR batch effect but rather analyze the data separately or account for batch effect in the statistical models where applicable.

7 Acknowledgements

I’d like to thank Wang and Lê Cao for publishing the code along with their paper “Managing batch effects in microbiome data”. Part of their code is used for the present analysis.

8 Session information

sessionInfo()
## R version 4.0.3 (2020-10-10)
## Platform: x86_64-w64-mingw32/x64 (64-bit)
## Running under: Windows 10 x64 (build 19041)
## 
## Matrix products: default
## 
## locale:
## [1] LC_COLLATE=English_United States.1252 
## [2] LC_CTYPE=English_United States.1252   
## [3] LC_MONETARY=English_United States.1252
## [4] LC_NUMERIC=C                          
## [5] LC_TIME=English_United States.1252    
## system code page: 65001
## 
## attached base packages:
## [1] stats4    parallel  stats     graphics  grDevices utils     datasets 
## [8] methods   base     
## 
## other attached packages:
##  [1] RColorBrewer_1.1-2          plotly_4.9.3               
##  [3] ggh4x_0.1.2.1               ggpubr_0.4.0               
##  [5] cowplot_1.1.1               sva_3.38.0                 
##  [7] genefilter_1.72.0           mgcv_1.8-34                
##  [9] RUVSeq_1.24.0               edgeR_3.32.0               
## [11] limma_3.46.0                EDASeq_2.24.0              
## [13] ShortRead_1.48.0            GenomicAlignments_1.26.0   
## [15] SummarizedExperiment_1.20.0 MatrixGenerics_1.2.0       
## [17] matrixStats_0.58.0          Rsamtools_2.6.0            
## [19] GenomicRanges_1.42.0        GenomeInfoDb_1.26.0        
## [21] Biostrings_2.58.0           XVector_0.30.0             
## [23] IRanges_2.24.0              S4Vectors_0.28.0           
## [25] BiocParallel_1.24.1         Biobase_2.50.0             
## [27] BiocGenerics_0.36.0         mixOmics_6.14.0            
## [29] MASS_7.3-53.1               MicrobeR_0.3.2             
## [31] picante_1.8.2               nlme_3.1-149               
## [33] vegan_2.5-7                 lattice_0.20-41            
## [35] permute_0.9-5               ape_5.4-1                  
## [37] microbiome_1.12.0           phyloseq_1.34.0            
## [39] qiime2R_0.99.35             forcats_0.5.1              
## [41] stringr_1.4.0               dplyr_1.0.5                
## [43] purrr_0.3.4                 readr_1.4.0                
## [45] tidyr_1.1.3                 tibble_3.1.0               
## [47] ggplot2_3.3.3               tidyverse_1.3.0            
## [49] here_1.0.1                  knitr_1.31                 
## 
## loaded via a namespace (and not attached):
##   [1] rappdirs_0.3.3         rtracklayer_1.50.0     R.methodsS3_1.8.1     
##   [4] bit64_4.0.5            aroma.light_3.20.0     DelayedArray_0.16.0   
##   [7] R.utils_2.10.1         data.table_1.14.0      rpart_4.1-15          
##  [10] hwriter_1.3.2          RCurl_1.98-1.2         generics_0.1.0        
##  [13] GenomicFeatures_1.42.0 RSQLite_2.2.4          bit_4.0.4             
##  [16] xml2_1.3.2             lubridate_1.7.10       assertthat_0.2.1      
##  [19] xfun_0.22              hms_1.0.0              jquerylib_0.1.3       
##  [22] evaluate_0.14          fansi_0.4.2            progress_1.2.2        
##  [25] dbplyr_2.1.0           readxl_1.3.1           igraph_1.2.6          
##  [28] DBI_1.1.1              htmlwidgets_1.5.3      rARPACK_0.11-0        
##  [31] ellipsis_0.3.1         crosstalk_1.1.1        RSpectra_0.16-0       
##  [34] backports_1.2.1        annotate_1.68.0        biomaRt_2.46.0        
##  [37] vctrs_0.3.6            abind_1.4-5            cachem_1.0.4          
##  [40] withr_2.4.1            checkmate_2.0.0        treeio_1.14.0         
##  [43] prettyunits_1.1.1      cluster_2.1.0          lazyeval_0.2.2        
##  [46] crayon_1.4.1           ellipse_0.4.2          pkgconfig_2.0.3       
##  [49] zCompositions_1.3.4    labeling_0.4.2         nnet_7.3-14           
##  [52] rlang_0.4.10           lifecycle_1.0.0        BiocFileCache_1.14.0  
##  [55] modelr_0.1.8           cellranger_1.1.0       rprojroot_2.0.2       
##  [58] Matrix_1.3-2           aplot_0.0.6            phangorn_2.5.5        
##  [61] carData_3.0-4          Rhdf5lib_1.12.0        reprex_1.0.0          
##  [64] base64enc_0.1-3        png_0.1-7              viridisLite_0.3.0     
##  [67] bitops_1.0-6           R.oo_1.24.0            rhdf5filters_1.2.0    
##  [70] blob_1.2.1             jpeg_0.1-8.1           rstatix_0.7.0         
##  [73] DECIPHER_2.18.1        ggsignif_0.6.1         rtk_0.2.6.1           
##  [76] scales_1.1.1           memoise_2.0.0          magrittr_2.0.1        
##  [79] plyr_1.8.6             zlibbioc_1.36.0        compiler_4.0.3        
##  [82] philr_1.16.0           cli_2.3.1              ade4_1.7-16           
##  [85] patchwork_1.1.1        htmlTable_2.1.0        Formula_1.2-4         
##  [88] tidyselect_1.1.0       stringi_1.5.3          highr_0.8             
##  [91] yaml_2.2.1             askpass_1.1            locfit_1.5-9.4        
##  [94] latticeExtra_0.6-29    ggrepel_0.9.1          grid_4.0.3            
##  [97] sass_0.3.1             fastmatch_1.1-0        tools_4.0.3           
## [100] rio_0.5.26             rstudioapi_0.13        foreach_1.5.1         
## [103] foreign_0.8-81         gridExtra_2.3          farver_2.1.0          
## [106] Rtsne_0.15             digest_0.6.27          rvcheck_0.1.8         
## [109] BiocManager_1.30.10    quadprog_1.5-8         Rcpp_1.0.6            
## [112] car_3.0-10             broom_0.7.5            httr_1.4.2            
## [115] AnnotationDbi_1.52.0   colorspace_2.0-0       rvest_1.0.0           
## [118] XML_3.99-0.5           fs_1.5.0               truncnorm_1.0-8       
## [121] splines_4.0.3          tidytree_0.3.3         multtest_2.46.0       
## [124] xtable_1.8-4           jsonlite_1.7.2         ggtree_2.4.0          
## [127] corpcor_1.6.9          R6_2.5.0               Hmisc_4.5-0           
## [130] NADA_1.6-1.1           pillar_1.5.1           htmltools_0.5.1.1     
## [133] glue_1.4.2             fastmap_1.1.0          DT_0.17               
## [136] codetools_0.2-16       utf8_1.2.1             bslib_0.2.4           
## [139] curl_4.3               zip_2.1.1              openxlsx_4.2.3        
## [142] openssl_1.4.3          survival_3.2-10        rmarkdown_2.7         
## [145] biomformat_1.18.0      munsell_0.5.0          rhdf5_2.34.0          
## [148] GenomeInfoDbData_1.2.4 iterators_1.0.13       haven_2.3.1           
## [151] reshape2_1.4.4         gtable_0.3.0